# Read in the Manhattan rides data.
manhattan_rides_df <- read_csv("manhattan_rides.csv")

# Keep only the trip_min, gender and age_group columns of manhattan_rides_df.
# age_group becomes an ordered factor running from the youngest to the oldest
# bracket (values outside the listed levels become NA); gender becomes a
# factor.
trip_dur_age_gender_df <-
  manhattan_rides_df %>%
  select(trip_min, gender, age_group) %>%
  mutate(
    age_group = factor(
      age_group,
      levels = c("18-25", "26-35", "36-45", "46-55", "56-65", "66-85"),
      ordered = TRUE
    ),
    # as.is = FALSE makes type.convert() return a factor for character input
    # (values that parse as numbers or logicals are converted to those types
    # instead).
    gender = type.convert(gender, as.is = FALSE)
  )
# Trip duration has extreme outliers, so drop every ride whose trip_min falls
# outside the IQR fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1 <- quantile(trip_dur_age_gender_df[["trip_min"]], probs = 0.25)
Q3 <- quantile(trip_dur_age_gender_df[["trip_min"]], probs = 0.75)
inter_quart <- IQR(trip_dur_age_gender_df[["trip_min"]])

# Both fence conditions must hold for a ride to be kept.
trip_dur_age_gender_df <- filter(
  trip_dur_age_gender_df,
  trip_min >= Q1 - 1.5 * inter_quart & trip_min <= Q3 + 1.5 * inter_quart
)
# Grouped boxplots of trip duration (minutes): one cluster per age group,
# one box per gender within each cluster.
plot_ly(
  # sentence-case the gender labels so the legend entries are capitalised
  data = mutate(trip_dur_age_gender_df, gender = str_to_sentence(gender)),
  x = ~age_group,
  y = ~trip_min,
  color = ~gender,
  type = "box",
  colors = "viridis"
) %>%
  layout(
    boxmode = "group",
    xaxis = list(title = "Age Range"),
    yaxis = list(title = "Trip Duration (min)"),
    legend = list(title = list(text = "<b> Gender </b>"))
  )
# Average rider age per month for each gender, with a 95% confidence interval.
# Each row is one (month, gender) pair and holds:
#   total       - number of rides in the group
#   avg_age     - mean age
#   sd_age      - sample standard deviation of age
#   sem         - standard error of the mean
#   lower_bound - lower end of the t-based 95% confidence interval
#   upper_bound - upper end of the t-based 95% confidence interval
# The bounds are drawn as confidence bands around the monthly average-age
# lines in the plot built from this data frame.
avg_age_per_month_df <-
  # reuse the rides data already loaded from "manhattan_rides.csv" instead of
  # reading the same file a second time
  manhattan_rides_df %>%
  mutate(
    # collapse each ride's start time to the first day of its month
    date = floor_date(as_date(starttime), "month")
  ) %>%
  select(date, gender, age) %>%
  group_by(date, gender) %>%
  summarize(
    total = n(),
    avg_age = mean(age),
    sd_age = sd(age)
  ) %>%
  mutate(
    # sd() already uses the n - 1 denominator, so the standard error of the
    # mean is sd / sqrt(n), not sd / sqrt(n - 1)
    sem = sd_age / sqrt(total),
    # two-sided 95% interval: mean -/+ t(0.975, n - 1) * SEM
    lower_bound = avg_age - qt(0.975, df = total - 1) * sem,
    upper_bound = avg_age + qt(0.975, df = total - 1) * sem
  ) %>%
  ungroup()
# Line chart of the monthly average age, one line per gender, with a ribbon
# spanning lower_bound..upper_bound around each line.
avg_age_plot <-
  ggplot(
    # sentence-case the gender labels so the legend entries are capitalised
    mutate(avg_age_per_month_df, gender = str_to_sentence(gender)),
    aes(x = date, y = avg_age, color = gender)
  ) +
  geom_line(size = 1, alpha = 0.8) +
  geom_ribbon(
    aes(ymin = lower_bound, ymax = upper_bound),
    alpha = 0.2
  )

# Convert the static ggplot into an interactive plotly widget and set the
# axis and legend titles.
avg_age_plot %>%
  ggplotly() %>%
  layout(
    xaxis = list(title = "Date"),
    yaxis = list(title = "Age"),
    legend = list(title = list(text = "<b> Gender </b>"))
  )